


************* draw sample and create batches for human coders
* Purpose: pool the five cluster files, clean and de-duplicate them, remove
* articles with extreme headline/text lengths, draw a random sample of up to
* 25,000 articles and write it out as Excel batches of 500 articles each
* (batch_1.xls, batch_2.xls, ...) in the current working directory.
use "cluster1.dta", clear
forvalues k = 2/5 {
	append using "cluster`k'.dta"
}

* clean
* NOTE(review): page == 2015 and the three outlet strings below look like
* mis-parsed records (article text stored in the outlet field) - confirm.
drop if page == 2015
drop if outlet == "REPLIK TILL: Bättre att ha jobb att gå till än att få bidrag"
drop if outlet == `"Reflektioner till HD-artikel "Varje bil behöver tre parkeringsplatser""'
drop if outlet == "Bröllop Idag gifter vi oss. Marita Björk och Pär Maltesson. Torsby"

* drop perfect duplicates (same content first, then any remaining repeated id)
duplicates drop headline text date outlet, force
duplicates drop id, force

* remove overly short / long before drawing coding sample
* NOTE(review): strlen() measures bytes in Unicode Stata, so non-ASCII
* characters may count more than once; the cut-offs are kept as they were.
gen length_headline = strlen(headline)
drop if length_headline == 1
drop if length_headline > 111

gen length_text = strlen(text)
drop if length_text < 92 | length_text > 4739

* create batches of 500 random articles each (50 batches if 25,000 are drawn)
keep id page length_text headline text

* The order of the random-number calls below is unchanged, so the same seed
* yields the same sample and the same batch assignment as before.
set seed 2803
sample 25000, count

* shuffle the sampled articles, then number consecutive blocks of 500 rows
gen double random = runiform()
sort random
egen sampleid = seq(), block(500)
drop random

* Export every batch directly from memory with an -if- qualifier. This avoids
* writing, reloading and erasing a fixed-name scratch file ("temp.dta") that
* could overwrite an unrelated file or be left behind if an export fails.
* sampleid is not in the variable list, so it is not written to the sheets.
levelsof sampleid, local(levels)
foreach l of local levels {
	export excel id page length_text headline text using "batch_`l'.xls" if sampleid == `l', replace firstrow(variables)
}

	
	
	
***** get articles for prediction, parts I and II
********************************************************************************
* Purpose: build the set of not-yet-annotated articles once and export it in
* two consecutive slices (sorted by id) for prediction:
*   all_articles_for_pred_1.csv : rows 1 .. split_at
*   all_articles_for_pred_2.csv : rows split_at+1 .. end
* Both slices come from the same in-memory dataset, so they cannot overlap or
* skip rows.
*
* NOTE(review): compared with the earlier version, rows that exist only in the
* annotated-id list and rows with neither headline nor text are now removed.
* If such rows existed, row positions around the split point shift relative to
* previously exported CSVs.

* get IDs of annotated articles
cd ..
cd "prepare training data"
use "all_ids.dta", clear
keep id
cd ..
cd "raw articles"

* keep the id list in a Stata-managed temporary file instead of a fixed-name
* file in the data directory; it is removed automatically when the do-file ends
tempfile annotated_ids
save "`annotated_ids'"


* prepare file for predictions in Colab
use "cluster1.dta", clear
forvalues k = 2/5 {
	append using "cluster`k'.dta"
}

* clean
* NOTE(review): page == 2015 and the three outlet strings below look like
* mis-parsed records (article text stored in the outlet field) - confirm.
drop if page == 2015
drop if outlet == "REPLIK TILL: Bättre att ha jobb att gå till än att få bidrag"
drop if outlet == `"Reflektioner till HD-artikel "Varje bil behöver tre parkeringsplatser""'
drop if outlet == "Bröllop Idag gifter vi oss. Marita Björk och Pär Maltesson. Torsby"

* drop perfect duplicates
duplicates drop headline text date outlet, force
duplicates drop id, force

keep id headline text

* remove annotated articles
* keep(master) retains only articles WITHOUT a match in the annotated list.
* The previous "drop if _merge == 3" also kept ids found only in the annotated
* list, which carry no headline/text and were exported as blank rows.
merge 1:1 id using "`annotated_ids'", keep(master) nogenerate

* for a quick test run on a subsample, uncomment:
*set seed 2803
*sample 1000, count

* Drop articles that have neither headline nor text. This must test the source
* variables: headl_text always contains the separating blank and is therefore
* never missing().
drop if missing(headline) & missing(text)
gen headl_text = headline + " " + text

keep id headl_text
sort id, stable

* last row number that goes into part I; everything after it goes into part II
local split_at 1052211

* part I
preserve
keep if _n <= `split_at'
export delimited using "all_articles_for_pred_1.csv", replace
restore

* part II (as before, the part II rows remain in memory afterwards)
keep if _n > `split_at'
export delimited using "all_articles_for_pred_2.csv", replace











